In this script, we investigate the behavioral pattern of the cell phase scores for the 5 different cell cycle phases for the single cell seq data collected by Yoav’s lab.

For more information on cell cycle definition, go to github.com/singleCell-seq/cell-cycle

setwd('/Users/kushal/Documents/singleCell-method/project/analysis/')
library(data.table)
library(maptpx)
library(gplots)
library(philentropy) # github: HajkD/philentropy
library(dplyr)
library(edgeR)
library(qtlcharts)
library(CountClust)

Prepare data

Import raw count data.

reads <- data.frame(fread('../data/reads.txt'), row.names=1)
molecules <-  data.frame(fread('../data/molecules.txt'), row.names=1)

Quality single cell index.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")
anno <- data.frame(fread('../data/annotation.txt'))

Keep only quality single cells. Remove bulk gene expression data.

molecules <- molecules[, grepl("bulk", colnames(molecules)) |
                         colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$well == "bulk" | anno$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules) == nrow(anno),
          colnames(molecules) == anno$sample_id)

reads <- reads[, grepl("bulk", colnames(reads)) |
                         colnames(reads) %in% quality_single_cells]
stopifnot(ncol(reads) == nrow(anno),
          colnames(reads) == anno$sample_id)

Exclude genes with a sum of 0 count across quality single cells.

expressed <- rowSums(molecules[, anno$well == "bulk"]) > 0 &
             rowSums(molecules[, anno$well != "bulk"]) > 0
molecules <- molecules[expressed, ]

expressed <- rowSums(reads[, anno$well == "bulk"]) > 0 &
             rowSums(reads[, anno$well != "bulk"]) > 0
reads <- reads[expressed, ]

molecules_single <- molecules %>% select(-contains("bulk"))
reads_single <- reads %>% select(-contains("bulk"))

Remove genes with max molecule numer larger than 1024

molecules_single <- molecules_single[apply(molecules_single,1,max) < 1024,];

Now we draw a list of marker genes that have cell cycle information.

cell_cycle_genes <- read.table("../data/cellcyclegenes.txt", header = TRUE, sep="\t")

## create 5 lists of 5 phases (de-level and then remove "")
cell_cycle_genes_list <- lapply(1:5,function(x){
  temp <- as.character(cell_cycle_genes[,x])
  temp[temp!=""]
})

labs <- unique(unlist(lapply(1:5, function(k) 
                            match(cell_cycle_genes_list[[k]],
                            rownames(molecules_single)))) )
labs <- labs[!is.na(labs)]
molecules_single_cell_cycle <- molecules_single[labs,]

Batch Corrected data

cpm_data <- voom(t(molecules_single[labs,]))$E
individual_id <- sapply(1:length(rownames(cpm_data)), function(x) 
                        strsplit(rownames(cpm_data)[x],"[.]")[[1]][1])

batch_id <- sapply(1:length(rownames(cpm_data)), function(x)
                    strsplit(rownames(cpm_data)[x],"[.]")[[1]][2]);

individual.batch.id <- paste0(individual_id, "_", batch_id);


molecules_single <- t(BatchCorrectedCounts(t(molecules_single),individual.batch.id,use_parallel=TRUE));


reads_single <- t(BatchCorrectedCounts(t(reads_single),individual.batch.id,use_parallel=TRUE));

molecules_single_cell_cycle <- molecules_single[labs,];

Gene cpm profile

Cell cycle phase genes

We explore these genes in greater detail. How do the iplotCurves look like for the single cells for the cell cylce genes.

# Helper function for plotting curves
iplotCurves_cell_cycle <- function(data,genes_list)
{
  ## data is assumed to be a genes-by-samples matrix, same as molecules_single
  labs <- match(genes_list, rownames(data));
  temp_data <- data[labs,];
  iplotCurves(temp_data);
}

Using the reordering technique of Macosko et al to order the cells as per their position in the cell cycle.

batch_removed_cpm <- voom(t(molecules_single_cell_cycle))$E;
ordered_cells <- as.vector(as.matrix(read.table("../data/ipsc_ordered_cells.txt")));

indices <- match(ordered_cells, colnames(molecules_single_cell_cycle));
batch_removed_cpm <- batch_removed_cpm[indices,];

sample gene cell cycle genes corresponding to first phase.

iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[1]])
## Set screen size to height=700 x width=1000

The iplotCurves for cell cycle genes corresponding to second phase.

iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[2]])

The iplotCurves for cell cycle genes corresponding to third phase.

iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[3]])

The iplotCurves for cell cycle genes corresponding to fourth phase.

iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[4]])

The iplotCurves for cell cycle genes corresponding to fifth phase.

iplotCurves_cell_cycle(t(batch_removed_cpm),cell_cycle_genes_list[[5]])

Not all the cell cycle genes seem to be informative. So, we pick the cell cycle genes that get picked up by the admixture method to be cluster driving. We try to see the patterns of expression of these cluster driving genes and see if they are indeed sinusoidal.

Admixture k=3 top cluster driving genes

ipsc_topics_cellcycle_batchcorrect <- get(load("../../project/rdas/topic_fit_ipsc_batchcorrect_cellcycle.rda"));
topics_clust <- ipsc_topics_cellcycle_batchcorrect[[2]]
clust <- topics_clust$K
theta <- topics_clust$theta
features <- ExtractTopFeatures(theta,top_features=50,method="poisson")

features_vec <- unique(as.vector(features));

class <- as.numeric(apply(theta[features_vec,], 1, which.max))
imp_gene_names <- colnames(batch_removed_cpm[,features_vec]);
iplotCurves_cell_cycle(t(batch_removed_cpm),imp_gene_names)

Cell-cycle scores

We compute cell cycle scores for the cell cycle genes for every single cell.

ans <- sapply(cell_cycle_genes_list,function(xx){
      
      #### create table of each phase
      reads_single_phase <- reads_single[rownames(reads_single) %in% unlist(xx) ,]
      #### add average expression of all genes in the phase
      combined_matrix <- rbind(reads_single_phase, 
                             average=apply(reads_single_phase,2,mean))
      #### use transpose to compute cor matrix
      cor_matrix <- cor(t(combined_matrix))
      #### take the numbers
      cor_vector <- cor_matrix[,dim(cor_matrix)[1]]
      #### restrict to correlation >= 0.3 
      reads_single_phase_restricted <- reads_single_phase[rownames(reads_single_phase) %in% names(cor_vector[cor_vector >= 0.3]),]
      #### apply normalization to reads
      norm_factors_single <- calcNormFactors(reads_single_phase_restricted, method = "TMM")
      reads_single_cpm <- cpm(reads_single_phase_restricted, log = TRUE,
                                lib.size = colSums(reads_single) * norm_factors_single)
      #### output the phase specific scores (mean of normalized expression levels in the phase)
      apply(reads_single_cpm,2,mean)
})

We make a iplotCurves plot for the cell phase scores of the 578 single cells.

iplotCurves(ans)

Now we apply a 2-step normalization on the phase scores.

#### normalization function
flexible_normalization <- function(data_in,by_row=TRUE){
  if(by_row){
    row_mean <- apply(data_in,1,mean)
    row_sd   <- apply(data_in,1,sd)
    output <- data_in
    for(i in 1:dim(data_in)[1]){
      output[i,] <- (data_in[i,] - row_mean[i])/row_sd[i]
    }
  }
  #### if by column
  if(!by_row){
    col_mean <- apply(data_in,2,mean)
    col_sd   <- apply(data_in,2,sd)
    output <- data_in
    for(i in 1:dim(data_in)[2]){
      output[,i] <- (data_in[,i] - col_mean[i])/col_sd[i]
    }
  }
  output
}

#### apply the normalization function
## first normalized for each phase
ans_normed <- flexible_normalization(ans,by_row=FALSE)
## then normalized of each cell
ans_normed_normed <- flexible_normalization(ans_normed,by_row=TRUE)

The iplotCurves plot after the 2-way normalization.

iplotCurves(ans_normed_normed)

Assign the single cells to the cell phase

cell_phase <- apply(ans_normed_normed,1,function(x) colnames(cell_cycle_genes)[which.max(x)])
assign_cell_phase <- data.frame(cell_phase)
cell_phase_vector <- as.vector(as.matrix(assign_cell_phase));
cell_phase_vector <- factor(cell_phase_vector, 
                            levels = c("G1.S", "S", "G2.M", "M", "M.G1"))